Импорт phyloseq объекта и библиотек
Датасет - хроносерия разложения соломы - от фактор Day 10ти уровней
library(phyloseq)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggpubr)
library(ampvis2)
library(ANCOMBC)
library(heatmaply)
## Loading required package: plotly
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
##
## Loading required package: viridis
## Loading required package: viridisLite
## Registered S3 method overwritten by 'dendextend':
## method from
## rev.hclust vegan
## Registered S3 methods overwritten by 'registry':
## method from
## print.registry_field proxy
## print.registry_entry proxy
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust vegan
##
## ======================
## Welcome to heatmaply version 1.3.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags:
## https://stackoverflow.com/questions/tagged/heatmaply
## ======================
library(WGCNA)
## Loading required package: dynamicTreeCut
## Loading required package: fastcluster
##
## Attaching package: 'fastcluster'
##
## The following object is masked from 'package:stats':
##
## hclust
##
##
##
## Attaching package: 'WGCNA'
##
## The following object is masked from 'package:stats':
##
## cor
library(compositions)
## Welcome to compositions, a package for compositional data analysis.
## Find an intro with "? compositions"
##
##
## Attaching package: 'compositions'
##
## The following object is masked from 'package:WGCNA':
##
## cor
##
## The following object is masked from 'package:heatmaply':
##
## normalize
##
## The following objects are masked from 'package:stats':
##
## anova, cor, cov, dist, var
##
## The following objects are masked from 'package:base':
##
## %*%, norm, scale, scale.default
library(igraph)
##
## Attaching package: 'igraph'
##
## The following object is masked from 'package:compositions':
##
## normalize
##
## The following object is masked from 'package:heatmaply':
##
## normalize
##
## The following object is masked from 'package:plotly':
##
## groups
##
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
##
## The following objects are masked from 'package:purrr':
##
## compose, simplify
##
## The following object is masked from 'package:tidyr':
##
## crossing
##
## The following object is masked from 'package:tibble':
##
## as_data_frame
##
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
##
## The following object is masked from 'package:base':
##
## union
ps.f <- readRDS("psf2")
#phyloseq object to ampvis2 object
#https://gist.github.com/KasperSkytte/8d0ca4206a66be7ff6d76fc4ab8e66c6
phyloseq_to_ampvis2 <- function(physeq) {
#check object for class
if(!any(class(physeq) %in% "phyloseq"))
stop("physeq object must be of class \"phyloseq\"", call. = FALSE)
#ampvis2 requires taxonomy and abundance table, phyloseq checks for the latter
if(is.null(physeq@tax_table))
stop("No taxonomy found in the phyloseq object and is required for ampvis2", call. = FALSE)
#OTUs must be in rows, not columns
if(phyloseq::taxa_are_rows(physeq))
abund <- as.data.frame(phyloseq::otu_table(physeq)@.Data)
else
abund <- as.data.frame(t(phyloseq::otu_table(physeq)@.Data))
#tax_table is assumed to have OTUs in rows too
tax <- phyloseq::tax_table(physeq)@.Data
#merge by rownames (OTUs)
otutable <- merge(
abund,
tax,
by = 0,
all.x = TRUE,
all.y = FALSE,
sort = FALSE
)
colnames(otutable)[1] <- "OTU"
#extract sample_data (metadata)
if(!is.null(physeq@sam_data)) {
metadata <- data.frame(
phyloseq::sample_data(physeq),
row.names = phyloseq::sample_names(physeq),
stringsAsFactors = FALSE,
check.names = FALSE
)
#check if any columns match exactly with rownames
#if none matched assume row names are sample identifiers
samplesCol <- unlist(lapply(metadata, function(x) {
identical(x, rownames(metadata))}))
if(any(samplesCol)) {
#error if a column matched and it's not the first
if(!samplesCol[[1]])
stop("Sample ID's must be in the first column in the sample metadata, please reorder", call. = FALSE)
} else {
#assume rownames are sample identifiers, merge at the end with name "SampleID"
if(any(colnames(metadata) %in% "SampleID"))
stop("A column in the sample metadata is already named \"SampleID\" but does not seem to contain sample ID's", call. = FALSE)
metadata$SampleID <- rownames(metadata)
#reorder columns so SampleID is the first
metadata <- metadata[, c(which(colnames(metadata) %in% "SampleID"), 1:(ncol(metadata)-1L)), drop = FALSE]
}
} else
metadata <- NULL
#extract phylogenetic tree, assumed to be of class "phylo"
if(!is.null(physeq@phy_tree)) {
tree <- phyloseq::phy_tree(physeq)
} else
tree <- NULL
#extract OTU DNA sequences, assumed to be of class "XStringSet"
if(!is.null(physeq@refseq)) {
#convert XStringSet to DNAbin using a temporary file (easiest)
fastaTempFile <- tempfile(pattern = "ampvis2_", fileext = ".fa")
Biostrings::writeXStringSet(physeq@refseq, filepath = fastaTempFile)
} else
fastaTempFile <- NULL
#load as normally with amp_load
ampvis2::amp_load(
otutable = otutable,
metadata = metadata,
tree = tree,
fasta = fastaTempFile
)
}
#variance stabilisation from DESeq2
ps_vst <- function(ps, factor){
require(DESeq2)
require(phyloseq)
diagdds = phyloseq_to_deseq2(ps, as.formula(paste( "~", factor)))
diagdds = estimateSizeFactors(diagdds, type="poscounts")
diagdds = estimateDispersions(diagdds, fitType = "local")
pst <- varianceStabilizingTransformation(diagdds)
pst.dimmed <- t(as.matrix(assay(pst)))
# pst.dimmed[pst.dimmed < 0.0] <- 0.0
ps.varstab <- ps
otu_table(ps.varstab) <- otu_table(pst.dimmed, taxa_are_rows = FALSE)
return(ps.varstab)
}
#WGCNA visualisation
#result - list class object with attributes:
# ps - phyloseq object
# amp - ampwis2 object
# heat - heatmap with absalute read numbers
# heat_rel - hetmap with relative abundances
# tree - phylogenetic tree with taxonomy
color_filt <- function(ps, df){
library(tidyverse)
library(reshape2)
library(gridExtra)
l = list()
for (i in levels(df$module)){
message(i)
color_name <- df %>%
filter(module == i) %>%
pull(asv) %>%
unique()
ps.col <- prune_taxa(color_name, ps)
amp.col <- phyloseq_to_amp(ps.col)
heat <- amp_heatmap(amp.col, tax_show = 60,
group_by = "Day",
tax_aggregate = "OTU",
tax_add = "Genus",
normalise=FALSE,
showRemainingTaxa = TRUE)
ps.rel <- phyloseq::transform_sample_counts(ps.col, function(x) x / sum(x) * 100)
amp.r <- phyloseq_to_ampvis2(ps.rel)
heat.rel <- amp_heatmap(amp.r, tax_show = 60,
group_by = "Day",
tax_aggregate = "OTU",
tax_add = "Genus",
normalise=FALSE,
showRemainingTaxa = TRUE)
tree <- ps.col@phy_tree
taxa <- as.data.frame(ps.col@tax_table@.Data)
p1 <- ggtree(tree) +
geom_tiplab(size=2, align=TRUE, linesize=.5) +
theme_tree2()
taxa[taxa == "Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium"] <- "Allorhizobium"
taxa[taxa == "Burkholderia-Caballeronia-Paraburkholderia"] <- "Burkholderia"
tx <- taxa %>%
rownames_to_column("id") %>%
mutate(id = factor(id, levels = rev(get_taxa_name(p1)))) %>%
dplyr::select(-c(Kingdom, Species, Order)) %>%
melt(id.var = 'id')
p2 <- ggplot(tx, aes(variable, id)) +
geom_tile(aes(fill = value), alpha = 0.4) +
geom_text(aes(label = value), size = 2) +
theme_bw() +
theme(legend.position = "none",
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank())
p <- ggpubr::ggarrange(p1, p2, widths = c(0.9, 1))
l[[i]] <- list("ps" = ps.col,
"amp" = amp.col,
"heat" = heat,
"heat_rel" = heat.rel,
"tree" = p)
}
return(l)
}
Разделим датасет на две группы
1-я - в более чем 10% образцов должно быть хотя бы 10 ридов - эта группа пойдет в анализ далее
ps.inall <- phyloseq::filter_taxa(ps.f, function(x) sum(x > 10) > (0.1*length(x)), TRUE)
amp.inall <- phyloseq_to_ampvis2(ps.inall)
amp_heatmap(amp.inall,
tax_show = 40,
group_by = "Day",
tax_aggregate = "OTU",
tax_add = "Genus",
normalise=FALSE,
showRemainingTaxa = TRUE)
Вторая - оставшиеся, но более 100 ридов по всем образцам(далее -
вылетающие мажоры(ВМ))
Остальные филотипы выкидываем из анализа
ps.exl <- phyloseq::filter_taxa(ps.f, function(x) sum(x > 10) < (0.1*length(x)), TRUE)
ps.exl <- prune_taxa(taxa_sums(ps.exl) > 100, ps.exl)
amp.exl <- phyloseq_to_ampvis2(ps.exl)
amp_heatmap(amp.exl,
tax_show = 40,
group_by = "Day",
tax_aggregate = "OTU",
tax_add = "Genus",
normalise=FALSE,
showRemainingTaxa = TRUE)
То же, но c относительной представленностью.
ps.per <- phyloseq::transform_sample_counts(ps.f, function(x) x / sum(x) * 100)
ps.exl.taxa <- taxa_names(ps.exl)
ps.per.exl <- prune_taxa(ps.exl.taxa, ps.per)
amp.exl.r <- phyloseq_to_ampvis2(ps.per.exl)
amp_heatmap(amp.exl.r, tax_show = 60,
group_by = "Day",
tax_aggregate = "OTU",
tax_add = "Genus",
round = 2,
normalise=FALSE,
showRemainingTaxa = TRUE)
amp_heatmap(amp.exl.r, tax_show = 60,
group_by = "Day",
tax_aggregate = "OTU",
tax_add = "Genus",
round = 2,
normalise=FALSE, )
ВМ объединенные по родам. Относительная представленность.
amp_heatmap(amp.exl.r,
tax_show = 30,
group_by = "Day",
tax_aggregate = "Genus",
tax_add = "Phylum",
tax_class = "Proteobacteria",
round = 2,
normalise=FALSE,
showRemainingTaxa = TRUE)
Если посмотреть, если как распределяются ВМ по дням - похоже распределение ВМ связано не только с особенностью отдельных мешочков, но и с чисто техническими особенностями - вылетающие значения вылетают в основном не с биологическими повторностями, а с техническими(красная линия)
p_box <- phyloseq::sample_sums(ps.per.exl) %>%
as.data.frame(col.names = "values") %>%
setNames(., nm = "values") %>%
rownames_to_column("samples") %>%
mutate(Day = sapply(strsplit(samples, "-"), `[`, 3)) %>%
ggplot(aes(x=Day, y=values, color=Day, fill = Day)) +
geom_boxplot(aes(color=Day, fill = Day)) +
geom_point(color = "black", position = position_dodge(width=0.2)) +
geom_hline(yintercept = 10, colour = "red") +
theme_bw() +
theme(legend.position = "none")
p_box <- p_box + viridis::scale_color_viridis(option = "H", discrete = TRUE, direction=1, begin=0.1, end = 0.9, alpha = 0.5)
p_box + viridis::scale_fill_viridis(option = "H", discrete = TRUE, direction=1, begin=0.1, end = 0.9, alpha = 0.3)
после clr нормализации
otu.inall <- as.data.frame(ps.inall@otu_table@.Data)
ps.inall.clr <- ps.inall
otu_table(ps.inall.clr) <- phyloseq::otu_table(compositions::clr(otu.inall), taxa_are_rows = FALSE)
data <- ps.inall.clr@otu_table@.Data %>%
as.data.frame()
rownames(data) <- as.character(ps.inall.clr@sam_data$Description)
powers <- c(c(1:10), seq(from = 12, to=30, by=1))
sft <- pickSoftThreshold(data, powerVector = powers, verbose = 5, networkType = "signed")
## pickSoftThreshold: will use block size 338.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 338 of 338
## Warning: executing %dopar% sequentially: no parallel backend registered
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.185 23.60 0.8790 168.0000 168.00000 172.000
## 2 2 0.199 -15.50 0.8970 87.6000 87.20000 95.100
## 3 3 0.518 -13.10 0.8580 47.3000 46.80000 55.900
## 4 4 0.390 -6.46 0.8050 26.5000 26.00000 34.500
## 5 5 0.262 -3.46 0.7820 15.4000 14.90000 22.300
## 6 6 0.232 -2.30 0.7770 9.2300 8.78000 15.000
## 7 7 0.251 -1.87 0.6990 5.7300 5.36000 10.500
## 8 8 0.319 -1.75 0.7570 3.6800 3.41000 7.620
## 9 9 0.527 -2.07 0.7770 2.4400 2.20000 5.890
## 10 10 0.648 -2.28 0.8490 1.6600 1.47000 4.690
## 11 12 0.827 -2.41 0.8970 0.8420 0.69300 3.190
## 12 13 0.793 -2.38 0.7930 0.6220 0.49000 2.700
## 13 14 0.277 -3.97 0.0914 0.4690 0.35500 2.320
## 14 15 0.282 -3.83 0.1000 0.3610 0.26100 2.010
## 15 16 0.894 -2.17 0.9030 0.2820 0.19800 1.770
## 16 17 0.905 -2.10 0.9080 0.2240 0.15000 1.560
## 17 18 0.922 -2.04 0.9240 0.1810 0.11500 1.390
## 18 19 0.927 -2.01 0.9230 0.1480 0.08820 1.250
## 19 20 0.938 -1.93 0.9370 0.1220 0.06780 1.130
## 20 21 0.946 -1.86 0.9470 0.1020 0.05260 1.020
## 21 22 0.315 -3.12 0.2220 0.0856 0.04140 0.926
## 22 23 0.303 -3.96 0.1560 0.0728 0.03270 0.844
## 23 24 0.307 -2.92 0.1940 0.0623 0.02590 0.772
## 24 25 0.303 -2.84 0.1800 0.0538 0.02060 0.708
## 25 26 0.921 -1.63 0.8990 0.0467 0.01650 0.657
## 26 27 0.892 -1.61 0.8650 0.0408 0.01320 0.622
## 27 28 0.155 -1.93 -0.0202 0.0359 0.01040 0.591
## 28 29 0.199 -2.77 -0.0228 0.0317 0.00841 0.563
## 29 30 0.199 -2.69 -0.0203 0.0282 0.00684 0.537
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,cex=0.9,col="red")
abline(h=0.9,col="salmon")
после vst нормализации
ps.varstab <- ps_vst(ps.inall, "Day")
data2 <- ps.varstab@otu_table@.Data %>%
as.data.frame()
rownames(data2) <- as.character(ps.varstab@sam_data$Description)
powers <- c(seq(from = 1, to=10, by=0.5), seq(from = 11, to=20, by=1))
sft2 <- pickSoftThreshold(data2, powerVector = powers, verbose = 5, networkType = "signed hybrid")
## pickSoftThreshold: will use block size 338.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 338 of 338
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1.0 0.409 -1.05 0.704 50.100 45.60000 97.80
## 2 1.5 0.578 -1.07 0.895 30.900 27.00000 71.20
## 3 2.0 0.723 -1.15 0.942 20.300 17.90000 53.60
## 4 2.5 0.767 -1.24 0.871 14.000 12.00000 41.70
## 5 3.0 0.809 -1.31 0.909 10.000 7.86000 33.40
## 6 3.5 0.872 -1.31 0.958 7.380 5.39000 27.30
## 7 4.0 0.888 -1.30 0.968 5.580 3.76000 22.60
## 8 4.5 0.858 -1.36 0.910 4.320 2.67000 19.00
## 9 5.0 0.892 -1.36 0.943 3.400 1.94000 16.20
## 10 5.5 0.899 -1.38 0.954 2.720 1.41000 13.90
## 11 6.0 0.896 -1.39 0.962 2.210 1.07000 12.10
## 12 6.5 0.897 -1.33 0.962 1.820 0.84700 10.60
## 13 7.0 0.910 -1.30 0.970 1.520 0.65400 9.30
## 14 7.5 0.914 -1.31 0.966 1.280 0.51500 8.25
## 15 8.0 0.913 -1.28 0.954 1.090 0.41500 7.36
## 16 8.5 0.905 -1.28 0.937 0.938 0.34000 6.59
## 17 9.0 0.895 -1.30 0.913 0.813 0.27800 5.99
## 18 9.5 0.878 -1.33 0.900 0.709 0.23200 5.49
## 19 10.0 0.886 -1.34 0.894 0.624 0.19100 5.07
## 20 11.0 0.911 -1.31 0.928 0.491 0.12500 4.35
## 21 12.0 0.890 -1.29 0.894 0.396 0.08590 3.79
## 22 13.0 0.910 -1.26 0.918 0.325 0.06120 3.33
## 23 14.0 0.882 -1.25 0.856 0.272 0.04420 2.95
## 24 15.0 0.889 -1.19 0.864 0.230 0.03230 2.63
## 25 16.0 0.913 -1.18 0.893 0.198 0.02240 2.35
## 26 17.0 0.930 -1.16 0.912 0.172 0.01620 2.20
## 27 18.0 0.955 -1.19 0.947 0.151 0.01190 2.09
## 28 19.0 0.908 -1.21 0.896 0.134 0.00896 1.99
## 29 20.0 0.916 -1.21 0.906 0.120 0.00649 1.91
plot(sft2$fitIndices[,1], -sign(sft2$fitIndices[,3])*sft2$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence"))
text(sft2$fitIndices[,1], -sign(sft2$fitIndices[,3])*sft2$fitIndices[,2], labels=powers,cex=0.9,col="red")
abline(h=0.9,col="salmon")
ordination
net <- blockwiseModules(data, power=3, TOMType="unsigned", networkType="signed")
modules_of_interest <- c("green", "turquoise", "tan")
mergedColors <- labels2colors(net$colors)
modules_of_interest <- mergedColors2 %>% unique()
df <- module_df %>%
rename("id" = "asv")
df <- df %>%
dplyr::select(-"id") %>%
mutate(colors = as.factor(colors))
ps.inall.col <- ps.inall
tx <- cbind(taxa, df)
tax_table(ps.inall.col) <- tax_table(as.matrix(tx))
ord <- ordinate(ps.inall.col, "NMDS", "unifrac")
plot_ordination(ps.inall.col, ord, type = "species", color = "colors") +
scale_color_manual(values = c("blue", "brown", "green", "salmon", "turquoise")) +
theme_bw()
ordination chunk
ps.inall.col <- ps.inall
tx <- cbind(taxa, df)
ps.inall.col@tax_table
tax_table(ps.inall.col) <- tax_table(as.matrix(tx))
ord <- ordinate(ps.inall.col, "NMDS", "unifrac")
plot_ordination(ps.inall.col, ord, type = "species", color = "colors") +
scale_color_manual(values = c("blue", "brown", "green", "salmon", "turquoise")) +
theme_bw()